Chroms

t1 <- Sys.time()
EH23a <- read.csv("../FigureSideo/EH23a.softmasked_nuccomp.csv")
EH23a$chrom_num <- sub(".+chr", "", EH23a$Id)
EH23a$chrom_num[ EH23a$chrom_num == "X" ] <- 10
EH23a$chrom_num <- as.numeric(EH23a$chrom_num)
EH23a[1:3, ]
EH23a[, 1:2]

EH23b <- read.csv("../FigureSideo/EH23b.softmasked_nuccomp.csv")
EH23b$chrom_num <- sub(".+chr", "", EH23b$Id)
EH23b$chrom_num[ EH23b$chrom_num == "X" ] <- 10
EH23b$chrom_num <- as.numeric(EH23b$chrom_num)
EH23b[1:3, ]

wins

#source("../FigureSideo/Rfunctions.R")
#EH23a_wins <- get_wins("../FigureSideo/")
#EH23b_wins <- get_wins("../FigureSideo/EH23b.softmasked_wins.csv")

Syri

syri <- read.table("/media/knausb/E737-9B48/knausb/mm2_maps/EH23a_EH23b/EH23a_EH23brcsyri.out",
                   sep = "\t", na.strings = "-")
names(syri) <- c("ref_chrom", "ref_start", "ref_end", "ref_seq", "query_seq", "query_chrom", "query_start", "query_end", "unique_ID", "parent_ID", "Annotation_Type", "Copy_Status")
#syri$ref_start[is.na(as.numeric(syri$ref_start))]
syri$ref_start <- as.numeric(syri$ref_start)
syri$ref_end <- as.numeric(syri$ref_end)
syri$query_start <- as.numeric(syri$query_start)
syri$query_end <- as.numeric(syri$query_end)

syri[1:3, ]
sort(table(syri$Annotation_Type), decreasing = TRUE)

syn <- syri[syri[, 11] == "SYN", ]
syn[1:3, ]
nrow(syn)
last_win <- 1
#coalesce <- 5e4
#coalesce <- 1e5
coalesce <- 4e5
#coalesce <- 1e6
for( j in 2:nrow(syn) ){
  if( syn$ref_start[j] - syn$ref_end[last_win] < coalesce &
      syn$query_start[j] - syn$query_end[last_win] < coalesce &
      syn$ref_chrom[j] == syn$ref_chrom[last_win] ){
        syn$ref_end[ last_win ] <- syn$ref_end[j]
        syn$query_end[ last_win ] <- syn$query_end[j]
        #syn$ref_end[j] <- NA
        syn$unique_ID[j] <- NA
  } else {
    last_win <- j
  }
}

#syn <- syn[ !is.na( syn$ref_end ), ]
syn <- syn[ !is.na( syn$unique_ID ), ]
syn[1:3, ]
nrow(syn)
min(syn$ref_end - syn$ref_start)
hist(syn$ref_end - syn$ref_start)

sort(syn$ref_end - syn$ref_start, decreasing = T)[1:10]

# names(syn) <- c("ref_chrom", "ref_start", "ref_end", "ref_seq", "query_seq", "query_chrom", "query_start", "query_end", "unique_ID", "parent_ID", "Annotation_Type", "Copy_Status")
# syn$ref_start <- as.numeric(syn$ref_start)
# syn$ref_end <- as.numeric(syn$ref_end)
# syn$query_start <- as.numeric(syn$query_start)
# syn$query_end <- as.numeric(syn$query_end)
syn[1:3, ]

inv <- syri[syri$Annotation_Type == "INV", ]
# tmp <- inv$query_start
# inv$query_start <- inv$query_end
# inv$query_end <- tmp
# inv[1:3, ]
#inv$query_end - inv$query_start

dup <- syri[syri$Annotation_Type == "DUP", ]
dup[1:3, ]
nrow(dup)
trans <- syri[syri$Annotation_Type == "TRANS", ]
trans[1:3, ]
nrow(trans)

gribbon

# nrow(syn)
# x <- syn[1:1000, ]
# nsmooth <- 10
# min_len <- 1e5

#gribbon <- function(x, nsmooth = 50, min_len = 1e5){
gribbon <- function(
    x, 
    nsmooth = 10, 
    min_len = 1e5, 
    coalesce = 0, 
    invert = FALSE){
  
  x <- x[(x$ref_end - x$ref_start) >= min_len, ]
  x <- x[(x$query_end - x$query_start) >= min_len, ]
  if( nrow(x) <= 0 ){
    return("No rows")
  }
  
  # if( coalesce > 0 ){
  #   last_win <- 1
  #   for( j in 2:nrow(x) ){
  #     if( x$ref_start[j] - x$ref_end[last_win] < coalesce &
  #         x$query_start[j] - x$query_end[last_win] < coalesce ){
  #       x$ref_end[ last_win ] <- x$ref_end[j]
  #       x$query_end[ last_win ] <- x$query_end[j]
  #       x$ref_end[j] <- NA
  #     } else {
  #       last_win <- j
  #     }
  #   }
  #   x <- x[ !is.na( x$ref_end ), ]
  # }
  
  if( invert ){
    tmp <- x$query_start
    x$query_start <- x$query_end
    x$query_end <- tmp
  }
  
  x$ref_chrom_num <- sub(".+chr", "", x$ref_chrom)
  x$ref_chrom_num[ x$ref_chrom_num == "X" ] <- 10
  x$ref_chrom_num[ x$ref_chrom_num == "Y" ] <- 11
  x$ref_chrom_num <- as.numeric(x$ref_chrom_num)
  x$ref_chrom_num <- x$ref_chrom_num - 0.2
  x$query_chrom_num <- sub(".+chr", "", x$query_chrom)
  x$query_chrom_num[ x$query_chrom_num == "X" ] <- 10
  x$query_chrom_num[ x$query_chrom_num == "Y" ] <- 11
  x$query_chrom_num <- as.numeric(x$query_chrom_num)
  x$query_chrom_num <- x$query_chrom_num + 0.2
  
#  nrow(x)
  my_x <- seq( from = -4, to = 4, length.out = nsmooth)
  my_y <- 1/( 1 + exp(1)^-my_x)
  my_x <- (my_x + 4)/8
  # plot(my_x, my_y)
  
#  my_x  <- c(my_x, rev(my_x))
  my_polys <- data.frame( 
    ID = rep(x$unique_ID, each = nsmooth * 2),
    ref_chrom_num = rep(x$ref_chrom_num, each = nsmooth * 2),
#    ref_start = rep(NA, each = nsmooth * 2),
#    ref_end = rep(NA, each = nsmooth * 2),
    query_chrom_num = rep(x$query_chrom_num, each = nsmooth * 2),
#    query_start = rep(NA, each = nsmooth * 2),
#    query_end = rep(NA, each = nsmooth * 2)
     poly_xs = rep(NA, each = nsmooth * 2),
     poly_ys = rep(NA, each = nsmooth * 2)
  )
  my_polys[1:3, ]
  
  my_vs <- seq(1, nrow(my_polys) + 1, by = nsmooth * 2)
#  for( i in seq(1, nrow(my_polys), by = nsmooth * 2) ){
  for( i in 1:nrow(x) ){
    my_index <- my_vs[i]:(my_vs[i+1] - 1)
    xmins <- seq( x$ref_chrom_num[i], x$query_chrom_num[i], length.out = nsmooth)
    
#    my_y * (x$query_end[i] - x$query_start[i]) + x$query_start[i]
    
    ymins <- my_y * (x$query_start[i] - x$ref_start[i]) + x$ref_start[i]
    ymaxs <- my_y * (x$query_end[i] - x$ref_end[i]) + x$ref_end[i]
    
    my_polys$poly_xs[my_index]    <- c(xmins, rev(xmins))
    my_polys$poly_ys[my_index] <- c(ymins, rev(ymaxs))
    #x$ref_start[ my_index ] <- seq(x$ref_chrom[i], x$query_chrom[i], length.out = nsmooth)
    
  }
#  my_polys[1:3, ]
  return( my_polys )  
}
#poly_coords <- gribbon(syn[1:1000, ])
#poly_coords <- gribbon(syn, min_len = 1e5)
#
nrow(syn)
## [1] 40
poly_coords <- gribbon(syn, min_len = 1e1, coalesce = 0)

#poly_coords <- gribbon(syn, min_len = 1e3)
# poly_coords <- gribbon(syn, nsmooth = 40, min_len = 1e3)
# poly_coords <- gribbon(syn, min_len = 1e1)
length(unique(poly_coords$ID))
## [1] 40
poly_coords_inv <- gribbon(inv, min_len = 1e4, invert = TRUE)
poly_coords_dup <- gribbon(dup, min_len = 1e4)
poly_coords_trans <- gribbon(trans, min_len = 1e4)

get_wins

cmd <- "~/gits/hempy/bin/fast_win.py /media/knausb/Vining_lab/knausb/mm2_maps/EH23a_EH23b_v2/EH23b.rc458X.softmasked.fasta.gz --win_size 1000000"
#system(cmd)
# sampn <- "EH23b"

get_wins <- function( sampn ){
#  sampd <- "../FigureSideo/EH23a.softmasked_wins.csv"
  wins_dir <- "../FigureSideo/"
  gffs_dir <- "/media/knausb/E737-9B48/releases/scaffolded/"
  
#  sampn <- unlist(lapply(strsplit(sampd, split = "/"), function(x){x[length(x)]}))
#  sampn <- unlist(lapply(strsplit(sampd, split = "//"), function(x){x[length(x)]}))
#  sampn <- sub(".softmasked_wins.csv", "", sampn)
#  sub(paste(sampn, ".softmasked_wins.csv", sep = ""), "", sampd)
  
  if( sampn == "EH23b" ){
    wins <- read.csv( "EH23b.rc458X.softmasked_wins.csv" )
    wins$Id <- sub("EH23a", "EH23b", wins$Id)
  } else {
    wins <- read.csv( paste( wins_dir, sampn, ".softmasked_wins.csv", sep = "") )
  }
#  wins <- read.csv( paste( sampn, ".softmasked_wins.csv", sep = "") )
#  wins <- read.csv( sampd )
#  wins <- wins[ grep( paste(sampn, ".chr", sep = ""), wins$Id ), ]
  wins$chrn <- sub(".+chr", "", wins$Id)
  wins$chrn[ wins$chrn == "X" ] <- 10
  wins$chrn[ wins$chrn == "Y" ] <- 11
  wins$chrn <- as.numeric(wins$chrn)
  #wins[1:3, ]
  
  # Scale and center.
  # Scaline of CG windows (chromosome width) is on a per chromosome basis.
  #  wins$CGs <- 0
  wins$CGs <- wins$CG / wins$Win_length * 100
  #  wins$notCG <- (wins$Win_length / 1) - wins$CG
  #  wins$notCGs <- 0
  for( j in unique(wins$Id) ){
    wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j] - min(wins$CGs[ wins$Id == j], na.rm = TRUE)
    wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j] / max(wins$CGs[ wins$Id == j], na.rm = TRUE)
    #
    # wins$CGs[ wins$Id == j] <- wins$CG[ wins$Id == j] - min(wins$CG[ wins$Id == j], na.rm = TRUE)
    # wins$CGs[ wins$Id == j] <- wins$CGs[ wins$Id == j]/max(wins$CGs[ wins$Id == j], na.rm = TRUE)
    #
    # wins$notCGs[ wins$Id == j] <- wins$notCG[ wins$Id == j] - min(wins$notCG[ wins$Id == j], na.rm = TRUE)
    # wins$notCGs[ wins$Id == j] <- wins$notCGs[ wins$Id == j]/max(wins$notCGs[ wins$Id == j], na.rm = TRUE)
  }
  wins$iCGs <- 1 - wins$CGs
  
  #wins$ATs <- 1 - wins$CGs
  #wins$ATs <- wins$Win_length/1e6 - wins$CGs
  
  # genes <- read.table( 
  #   paste(sampd, "/", sampn, ".primary_high_confidence.gff3.gz", sep = "" ), 
  #   sep = "\t" )
  genes <- read.table( 
    paste(gffs_dir, sampn, "/", sampn, ".primary_high_confidence.gff3.gz", sep = "" ), 
    sep = "\t" )
  genes <- genes[genes[, 3] == "gene", ]
  
  if( sampn == "EH23b" ){
    # Column 4 is starts.
    genes[ genes[, 1] == "EH23b.chr4", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr4" ] - genes[ genes[, 1] == "EH23b.chr4", 4]
    genes[ genes[, 1] == "EH23b.chr4", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr4" ] - genes[ genes[, 1] == "EH23b.chr4", 5]
    
        genes[ genes[, 1] == "EH23b.chr5", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr5" ] - genes[ genes[, 1] == "EH23b.chr5", 4]
    genes[ genes[, 1] == "EH23b.chr5", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr5" ] - genes[ genes[, 1] == "EH23b.chr5", 5]
    
        genes[ genes[, 1] == "EH23b.chr8", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chr8" ] - genes[ genes[, 1] == "EH23b.chr8", 4]
    genes[ genes[, 1] == "EH23b.chr8", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chr8" ] - genes[ genes[, 1] == "EH23b.chr8", 5]
    
        genes[ genes[, 1] == "EH23b.chrX", 4] <- EH23b$Length[ EH23b$Id == "EH23b.chrX" ] - genes[ genes[, 1] == "EH23b.chrX", 4]
    genes[ genes[, 1] == "EH23b.chrX", 5] <- EH23b$Length[ EH23b$Id == "EH23b.chrX" ] - genes[ genes[, 1] == "EH23b.chrX", 5]
    
  }
  
  genes[1:3, ]
  
  
  # Windowize
  wins$gcnt <- 0
  for(i in 1:nrow(wins)){
    # if( sampn == "EH23b" ){
    #   wins$Id <- sub("EH23a", "EH23b", wins$Id)
    # }
    tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
    wins$gcnt[i] <- nrow(tmp)
  }
  
  # Scaling of gene count windows is on a per assembly basis.
  wins$gcntsc <- wins$gcnt - min(wins$gcnt)
  wins$gcntsc <- wins$gcntsc / max(wins$gcntsc)
  my_index <- round( wins$gcntsc * 100 )
  my_index[ my_index <= 0] <- 1
  
  # Color ramp.
  #wins$gcol <- heat.colors(n=100)[ my_index ]
  #wins$gcol <- colorRampPalette(c("yellow", "orange", "red"))( 100 )[ my_index ]
  #wins$gcol <- colorRampPalette(c("red", "orange", "yellow"))( 100 )[ my_index ]
  #wins$gcol <- colorRampPalette(c("#0000FF", "#228B22", "#A0522D"))( 100 )[ my_index ]
  #wins$gcol <- colorRampPalette(c("#87CEEB", "#3CB371", "#228B22", "#A0522D"))( 100 )[ my_index ]
  
  #wins$gcol <- viridisLite::plasma(n = 100, alpha = 1, begin = 0.1, end = 1)[ my_index ]
  wins$gcol <- viridisLite::magma(n = 100, alpha = 1, begin = 0.2, end = 1.00)[ my_index ]
  
  return(wins)
}
EH23a_wins <- get_wins("EH23a")
EH23a_wins[1:3, ]
EH23b_wins <- get_wins("EH23b")
EH23b_wins[1:3, ]

Windowize SNPs

EH23a_wins[1:3, ]
##           Id Win_number   Start     End Win_length      A      C      G      T
## 1 EH23a.chr1          0       0  999999    1000000 339510 159349 157789 343352
## 2 EH23a.chr1          1 1000000 1999999    1000000 346184 152100 152139 349577
## 3 EH23a.chr1          2 2000000 2999999    1000000 343278 155236 154692 346794
##      CG   CHG   CHH chrn        CGs      iCGs gcnt    gcntsc      gcol
## 1 14003 19695 90544    1 0.07933312 0.9206669  156 1.0000000 #FCFDBFFF
## 2 13018 18441 88693    1 0.00000000 1.0000000  141 0.9038462 #FED799FF
## 3 14100 19606 89229    1 0.08714562 0.9128544  123 0.7884615 #FEAD77FF
snps <- syri[syri$Annotation_Type == "SNP", ]
nrow(snps)
## [1] 3660371
snps[1:3, ]
##     ref_chrom ref_start ref_end ref_seq query_seq query_chrom query_start
## 5  EH23a.chr1      6376    6376       c         t  EH23a.chr1       43110
## 8  EH23a.chr1      6542    6542       A         G  EH23a.chr1       43276
## 11 EH23a.chr1      6779    6779       t         c  EH23a.chr1       43514
##    query_end unique_ID parent_ID Annotation_Type Copy_Status
## 5      43110  SNP21930      SYN1             SNP        <NA>
## 8      43276  SNP21933      SYN1             SNP        <NA>
## 11     43514  SNP21936      SYN1             SNP        <NA>
EH23a_wins$snps <- 0

for(i in 1:nrow(EH23a_wins)){
  tmp <- snps[ snps$ref_chrom == EH23a_wins$Id[i] & snps$ref_start >= EH23a_wins$Start[i] & snps$ref_end <= EH23a_wins$End[i], ]
  EH23a_wins$snps[i] <- nrow(tmp)
}
EH23a_wins$snpssc <- (EH23a_wins$snps - min(EH23a_wins$snps))/max(EH23a_wins$snps)

hist(EH23a_wins$snps)

hist(EH23a_wins$snps[ EH23a_wins$Id == "EH23a.chr4" ])

my_hist <- hist(EH23a_wins$snpssc * 9 + 1, plot = F)

#barplot(my_hist$density, space = 0, col = gray.colors(n=10))
#axis(side=1)
barplot(my_hist$density ~ my_hist$breaks[-10], space = 0, col = gray.colors(n=10))

BLAST

sampn <- "EH23a"
ablstn <- read.csv(
    paste("/media/knausb/Vining_lab/knausb/blast_projects/todd_rrna/blastn_uc/", sampn, "_blastn.csv", sep = ""),
    header = FALSE)
colnames(ablstn) <- c("qseqid","qlen","sseqid","slen","qstart","qend",
                     "sstart","send","evalue","bitscore","score","length",
                     "pident","nident","mismatch","positive","gapopen",
                     "gaps","ppos","sstrand")
ablstn <- ablstn[grep("chr", ablstn$sseqid), ]
ablstn$chrn <- sub(".*chr", "", ablstn$sseqid)
ablstn$chrn[ablstn$chrn == "X"] <- 10
ablstn$chrn[ablstn$chrn == "Y"] <- 11
ablstn$chrn <- as.numeric(ablstn$chrn)


sampn <- "EH23b"
bblstn <- read.csv(
    paste("/media/knausb/Vining_lab/knausb/blast_projects/todd_rrna/blastn_uc/", sampn, "_blastn.csv", sep = ""),
    header = FALSE)
colnames(bblstn) <- c("qseqid","qlen","sseqid","slen","qstart","qend",
                     "sstart","send","evalue","bitscore","score","length",
                     "pident","nident","mismatch","positive","gapopen",
                     "gaps","ppos","sstrand")
bblstn <- bblstn[grep("chr", bblstn$sseqid), ]
bblstn$chrn <- sub(".*chr", "", bblstn$sseqid)
bblstn$chrn[bblstn$chrn == "X"] <- 10
bblstn$chrn[bblstn$chrn == "Y"] <- 11
bblstn$chrn <- as.numeric(bblstn$chrn)

my_chrom <- "EH23b.chr4"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chr5"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]

bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chr8"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]
my_chrom <- "EH23b.chrX"
bblstn$sstart[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$sstart[ bblstn$sseqid == my_chrom ]
bblstn$send[ bblstn$sseqid == my_chrom ] <- EH23b$Length[ EH23b$Id == my_chrom ] - bblstn$send[ bblstn$sseqid == my_chrom ]

sampn <- "EH23b"

Plot

library(ggplot2)

chrom_wid <- 0.05
p <- ggplot2::ggplot()
p <- p + theme_bw()
p <- p + ggplot2::geom_rect( 
  data = EH23a, 
  ggplot2::aes( xmin = chrom_num - chrom_wid - 0.2,
                xmax = chrom_num + chrom_wid - 0.2,
                ymin = 1, ymax = Length),
  fill = "#DCDCDC",
  color = "#000000"
)

p <- p + ggplot2::geom_rect( 
  data = EH23b, 
  ggplot2::aes( xmin = chrom_num - chrom_wid + 0.2,
                xmax = chrom_num + chrom_wid + 0.2,
                ymin = 1, ymax = Length),
  fill = "#DCDCDC",
  color = "#000000"
)

# EH23a_wins[1:3, ]

my_cols <- gray.colors(n=10, start = 0.3, end = 0.9)[round(EH23a_wins$snpssc * 9 + 1)]
#my_cols <- gray.colors(n=100, start = 0.3, end = 0.9)[round(EH23a_wins$snpssc * 99 + 1)]
#p <- 
p + ggplot2::geom_rect( 
  data = EH23a_wins, 
  ggplot2::aes( xmin = chrn - 0.5,
                xmax = chrn - 0.25,
                ymin = Start, ymax = End),
#  fill = "#DCDCDC",
  fill = my_cols,
#  color = "#000000"
)

# Theme
p <- p + ggplot2::scale_x_continuous( 
  breaks = EH23a$chrom_num,
#  limits = c(0.6, 10.4),
#  limits = c(0.5, 10.5),
#  labels = EH23a$Id
  labels = sub("a.chr", ".chr", EH23a$Id)
)
  
p <- p + scale_y_continuous(
  breaks = seq( 0, 120e6, by = 10e6), 
  labels = seq( 0, 120, by = 10)
)
  
#  p <- p + ggplot2::theme_bw() + 
p <- p + ggplot2::theme( 
      #      panel.grid.minor.x = ggplot2::element_blank(), 
    panel.grid.minor.x = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 ),
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x=element_blank(),
    panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
    panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
  )
  
p <- p + ggplot2::ylab("Position (Mbp)")
p <- p + ggtitle( "EH23" )

p

#p <- p + theme(legend.position='none')
p <- p + geom_polygon( 
  data = poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID), #, fill = col),
  alpha = 2/5,
  show.legend = FALSE,
  #color = NA,
#  color = "#80808022",
#  color = "#000000",
  color = "#696969",
#  color = "#808080",
#  color = "#A9A9A9",
#  color = "#C0C0C0",
  linewidth = 0.4,
#            fill = ID
#  fill = "#C0C0C044"
  fill = "#808080"
  )

p

p <- p + geom_polygon( 
  data = poly_coords_inv, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 0.9,
  show.legend = FALSE,
  #color = NA,
#  color = "#80808022",
  #color = "#000000",
#  linewidth = 0.1,
#            fill = ID
#  fill = "#C0C0C044"
  fill = "#FFA500"
#  fill = "#808080"
  )
p

# p + geom_polygon( 
#   data = poly_coords_dup, 
#   aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
#   alpha = 0.9,
#   show.legend = FALSE,
#   fill = "#1E90FF"
#   )

# p + geom_polygon( 
#   data = poly_coords_trans, 
#   aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
#   alpha = 0.9,
#   show.legend = FALSE,
#   fill = "#1E90FF"
#   )


thinw <- 0.28
p <- p + ggplot2::geom_rect( 
  data = EH23a_wins, 
  ggplot2::aes(
    xmin = chrn - iCGs * thinw - 0.2,
    #xmax = chrn + iCGs * thinw,
    xmax = chrn - 0.2,
    ymin = Start, 
    ymax = End),
  fill = EH23a_wins$gcol,
  color = NA
)

p <- p + ggplot2::geom_rect( 
  data = EH23b_wins, 
  ggplot2::aes(
    #xmin = chrn - iCGs * thinw - 0.2,
    xmin = chrn + 0.2,
    xmax = chrn + iCGs * thinw + 0.2,
    #xmax = chrn - 0.2,
    ymin = Start, 
    ymax = End),
  fill = EH23b_wins$gcol,
  color = NA
)
p

#p + xlim(3.5, 4.5) #+ ylim(1e6, 2e6)

p + xlim(5.5, 8.5) + ylim(10e6, 40e6)
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
## Warning: Removed 10 rows containing missing values (`geom_rect()`).
## Removed 10 rows containing missing values (`geom_rect()`).
## Warning: Removed 654 rows containing missing values (`geom_rect()`).
## Warning: Removed 653 rows containing missing values (`geom_rect()`).

#pEH23 <- p
#save(pEH23, file = "ideo.RData")
#load("ideo.RData")
#pEH23
# Cannabinoids
my_rows <- grep("AB2|LY|NC_044376.1:c427", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
    data = ablstn[my_rows, ], 
    ggplot2::aes( 
      xmin = chrn - blstw,
      xmax = chrn + 0 - 0.14,
      ymin = sstart, 
      ymax = send),
    fill = "#228B22",
    color = '#228B22'
  )
my_rows <- grep("AB2|LY|NC_044376.1:c427", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
    data = bblstn[my_rows, ], 
    ggplot2::aes( 
      xmin = chrn + 0.14,
      xmax = chrn + blstw,
      ymin = sstart, 
      ymax = send),
    fill = "#228B22",
    color = '#228B22'
  )

# 5S
my_rows <- grep("5S_", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = ablstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn - blstw,
    xmax = chrn - 0.1,
    ymin = sstart, 
    ymax = send),
  fill = "#000000",
  color = '#000000'
)
my_rows <- grep("5S_", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = bblstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn + 0.1,
    xmax = chrn + blstw,
    ymin = sstart, 
    ymax = send),
  fill = "#000000",
  color = '#000000'
)

# 45S
my_rows <- grep("45s_|26s_|5.8s_|18s_", ablstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = ablstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn - blstw,
    xmax = chrn - 0.1,
    ymin = sstart, 
    ymax = send),
  fill = "#B22222",
  linewidth = 2,
  #    linewidth = 4,
  color = '#B22222'
)
my_rows <- grep("45s_|26s_|5.8s_|18s_", bblstn$qseqid)
blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = bblstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn + 0.1,
    xmax = chrn + blstw,
    ymin = sstart, 
    ymax = send),
  fill = "#B22222",
  linewidth = 2,
  #    linewidth = 4,
  color = '#B22222'
)

# Centromeres, subtelomeric repeats
my_rows <- grep("AH3Ma_CS-237_satelite|CsatSD_centromere_237bp|CsatSD_centromere_370bp", ablstn$qseqid)
#  blstw <- 0.48
p <- p + ggplot2::geom_rect( 
  data = ablstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn - 0.5,
    xmax = chrn - 0.2,
    ymin = sstart, 
    ymax = send),
  fill = "#1E90FF",
  color = '#1E90FF'
)
my_rows <- grep("AH3Ma_CS-237_satelite|CsatSD_centromere_237bp|CsatSD_centromere_370bp", bblstn$qseqid)
p <- p + ggplot2::geom_rect( 
  data = bblstn[my_rows, ], 
  ggplot2::aes( 
    xmin = chrn + 0.2,
    xmax = chrn + 0.5,
    ymin = sstart, 
    ymax = send),
  fill = "#1E90FF",
  color = '#1E90FF'
)
pEH23 <- p
pEH23

Sex chromosomes

#list.files("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/")

# ape::read.FASTA("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/EH23a.chrXrevcomp.fasta")
# ape::read.FASTA("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/AH3Ma.chrX.fasta")
# ape::read.FASTA("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/AH3Mb.chrYrevcomp.fasta")
# ape::read.FASTA("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/BCMb.chrYrevcomp.fasta")

chrom_df <- data.frame( 
  Id = c('EH23a.chrXrevcomp', 'AH3Ma.chrX', 'AH3Mb.chrYrevcomp', 'BCMb.chrYrevcomp'),
  Length = c(84640807, 84231629, 110682302, 107756508),
  Pos = c(1, 2, 3, 4))
EH23a.chrX_AH3Ma_syri <- read.table("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/EH23a.chrXrevcomp_AH3Ma.chrXsyri.out",
                   sep = "\t", na.strings = "-")
names(EH23a.chrX_AH3Ma_syri) <- c("ref_chrom", "ref_start", "ref_end", "ref_seq", "query_seq", "query_chrom", "query_start", "query_end", "unique_ID", "parent_ID", "Annotation_Type", "Copy_Status")
#syri$ref_start[is.na(as.numeric(syri$ref_start))]

EH23a.chrX_AH3Ma_syri$ref_chrom <- paste("EH23a.", EH23a.chrX_AH3Ma_syri$ref_chrom, sep="")
EH23a.chrX_AH3Ma_syri$ref_start <- as.numeric(EH23a.chrX_AH3Ma_syri$ref_start)
EH23a.chrX_AH3Ma_syri$ref_end <- as.numeric(EH23a.chrX_AH3Ma_syri$ref_end)

EH23a.chrX_AH3Ma_syri$query_chrom <- paste("AH3Ma.", EH23a.chrX_AH3Ma_syri$query_chrom, sep="")
EH23a.chrX_AH3Ma_syri$query_start <- as.numeric(EH23a.chrX_AH3Ma_syri$query_start)
EH23a.chrX_AH3Ma_syri$query_end <- as.numeric(EH23a.chrX_AH3Ma_syri$query_end)


syri <- read.table("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/AH3Ma.chrX_AH3Mb.chrYrevcompsyri.out",
                   sep = "\t", na.strings = "-")
names(syri) <- c("ref_chrom", "ref_start", "ref_end", "ref_seq", "query_seq", "query_chrom", "query_start", "query_end", "unique_ID", "parent_ID", "Annotation_Type", "Copy_Status")
#syri$ref_start[is.na(as.numeric(syri$ref_start))]
syri$ref_chrom <- paste("AH3Ma.", syri$ref_chrom, sep="")
syri$ref_start <- as.numeric(syri$ref_start)
syri$ref_end <- as.numeric(syri$ref_end)
syri$query_chrom <- paste("AH3Mb.", syri$query_chrom, sep="")
syri$query_start <- as.numeric(syri$query_start)
syri$query_end <- as.numeric(syri$query_end)
AH3Ma.chrX_AH3Mb.chrY_syri <- syri

syri <- read.table("/media/knausb/Vining_lab/knausb/mm2_maps/salk_chrY/orientation2/AH3Mb.chrYrevcomp_BCMb.chrYrevcompsyri.out",
                   sep = "\t", na.strings = "-")
names(syri) <- c("ref_chrom", "ref_start", "ref_end", "ref_seq", "query_seq", "query_chrom", "query_start", "query_end", "unique_ID", "parent_ID", "Annotation_Type", "Copy_Status")
#syri$ref_start[is.na(as.numeric(syri$ref_start))]
syri$ref_chrom <- paste("AH3Mb.", syri$ref_chrom, sep="")
syri$ref_start <- as.numeric(syri$ref_start)
syri$ref_end <- as.numeric(syri$ref_end)
syri$query_chrom <- paste("BCMb.", syri$query_chrom, sep="")
syri$query_start <- as.numeric(syri$query_start)
syri$query_end <- as.numeric(syri$query_end)
AH3Mb.chrYrevcomp_BCMb.chrY_syri <- syri
coalesce_rows <- function(syn){
  last_win <- 1
  coalesce <- 4e5
  for( j in 2:nrow(syn) ){
    if( syn$ref_start[j] - syn$ref_end[last_win] < coalesce &
      syn$query_start[j] - syn$query_end[last_win] < coalesce &
      syn$ref_chrom[j] == syn$ref_chrom[last_win] ){
      syn$ref_end[ last_win ] <- syn$ref_end[j]
      syn$query_end[ last_win ] <- syn$query_end[j]
      #syn$ref_end[j] <- NA
      syn$unique_ID[j] <- NA
    } else {
      last_win <- j
    }
  }

#syn <- syn[ !is.na( syn$ref_end ), ]
  syn <- syn[ !is.na( syn$unique_ID ), ]
  return(syn)
}
table(EH23a.chrX_AH3Ma_syri$Annotation_Type)
## 
##     CPG     CPL     DEL     DUP   DUPAL     HDR     INS     INV   INVAL   INVDP 
##      56     173   30806     900    1251    2259   30325      14     142     823 
## INVDPAL   INVTR INVTRAL   NOTAL     SNP     SYN   SYNAL     TDM   TRANS TRANSAL 
##    1156     423     591    5627  331654    1687    3873      12     421     592
EH23a.chrX_AH3Ma_syn <- EH23a.chrX_AH3Ma_syri[EH23a.chrX_AH3Ma_syri$Annotation_Type == "SYN", ]
EH23a.chrX_AH3Ma_syn <- coalesce_rows(EH23a.chrX_AH3Ma_syn)
#EH23a.chrX_AH3Ma_syn[1:3, ]
EH23a.chrX_AH3Ma_syn_poly_coords <- gribbon(EH23a.chrX_AH3Ma_syn, min_len = 1e1, coalesce = 0)
EH23a.chrX_AH3Ma_syn_poly_coords$poly_xs <- (EH23a.chrX_AH3Ma_syn_poly_coords$poly_xs - 9.8)/0.4 + 1
#EH23a.chrX_AH3Ma_syn_poly_coords[1:3, ]

AH3Ma_AH3Mb_syn <- AH3Ma.chrX_AH3Mb.chrY_syri[AH3Ma.chrX_AH3Mb.chrY_syri$Annotation_Type == "SYN", ]
AH3Ma_AH3Mb_syn <- coalesce_rows(AH3Ma_AH3Mb_syn)
AH3Ma_AH3Mb_syn_poly_coords <- gribbon(AH3Ma_AH3Mb_syn, min_len = 1e1, coalesce = 0)
AH3Ma_AH3Mb_syn_poly_coords$poly_xs <- (AH3Ma_AH3Mb_syn_poly_coords$poly_xs - 9.8)/0.4 + 1
AH3Ma_AH3Mb_syn_poly_coords$poly_xs <- AH3Ma_AH3Mb_syn_poly_coords$poly_xs + 1

AH3Mb_BCMb_syn <- AH3Mb.chrYrevcomp_BCMb.chrY_syri[AH3Mb.chrYrevcomp_BCMb.chrY_syri$Annotation_Type == "SYN", ]
AH3Mb_BCMb_syn <- coalesce_rows(AH3Mb_BCMb_syn)
AH3Mb_BCMb_syn_poly_coords <- gribbon(AH3Mb_BCMb_syn, min_len = 1e1, coalesce = 0)
AH3Mb_BCMb_syn_poly_coords$poly_xs <- (AH3Mb_BCMb_syn_poly_coords$poly_xs - 9.8)/0.4 + 1
AH3Mb_BCMb_syn_poly_coords$poly_xs <- AH3Mb_BCMb_syn_poly_coords$poly_xs + 2
EH23a.chrX_AH3Ma_inv <- EH23a.chrX_AH3Ma_syri[EH23a.chrX_AH3Ma_syri$Annotation_Type == "INV", ]
#EH23a.chrX_AH3Ma_inv <- coalesce_rows(EH23a.chrX_AH3Ma_inv)
#EH23a.chrX_AH3Ma_syn[1:3, ]
EH23a.chrX_AH3Ma_inv_poly_coords <- gribbon(EH23a.chrX_AH3Ma_inv, min_len = 1e1, coalesce = 0, invert = TRUE)
EH23a.chrX_AH3Ma_inv_poly_coords$poly_xs <- (EH23a.chrX_AH3Ma_inv_poly_coords$poly_xs - 9.8)/0.4 + 1
#EH23a.chrX_AH3Ma_syn_poly_coords[1:3, ]

AH3Ma_AH3Mb_inv <- AH3Ma.chrX_AH3Mb.chrY_syri[AH3Ma.chrX_AH3Mb.chrY_syri$Annotation_Type == "INV", ]
#AH3Ma_AH3Mb_inv <- coalesce_rows(AH3Ma_AH3Mb_inv)
AH3Ma_AH3Mb_inv_poly_coords <- gribbon(AH3Ma_AH3Mb_inv, min_len = 1e1, coalesce = 0, invert = TRUE)
AH3Ma_AH3Mb_inv_poly_coords$poly_xs <- (AH3Ma_AH3Mb_inv_poly_coords$poly_xs - 9.8)/0.4 + 1
AH3Ma_AH3Mb_inv_poly_coords$poly_xs <- AH3Ma_AH3Mb_inv_poly_coords$poly_xs + 1

AH3Mb_BCMb_inv <- AH3Mb.chrYrevcomp_BCMb.chrY_syri[AH3Mb.chrYrevcomp_BCMb.chrY_syri$Annotation_Type == "INV", ]
#AH3Mb_BCMb_inv <- coalesce_rows(AH3Mb_BCMb_inv)
AH3Mb_BCMb_inv_poly_coords <- gribbon(AH3Mb_BCMb_inv, min_len = 1e1, coalesce = 0, invert = TRUE)
AH3Mb_BCMb_inv_poly_coords$poly_xs <- (AH3Mb_BCMb_inv_poly_coords$poly_xs - 9.8)/0.4 + 1
AH3Mb_BCMb_inv_poly_coords$poly_xs <- AH3Mb_BCMb_inv_poly_coords$poly_xs + 2
dup <- EH23a.chrX_AH3Ma_syri[EH23a.chrX_AH3Ma_syri$Annotation_Type == "DUP", ]
dup_poly_coords <- gribbon(dup, min_len = 1e1, coalesce = 0, invert = FALSE)
dup_poly_coords$poly_xs <- (dup_poly_coords$poly_xs - 9.8)/0.4 + 1
dup_poly_coords$poly_xs <- dup_poly_coords$poly_xs + 0
EH23a.chrX_AH3Ma_dup_poly_coords <- dup_poly_coords

dup <- AH3Ma.chrX_AH3Mb.chrY_syri[AH3Ma.chrX_AH3Mb.chrY_syri$Annotation_Type == "DUP", ]
dup_poly_coords <- gribbon(dup, min_len = 1e1, coalesce = 0, invert = FALSE)
dup_poly_coords$poly_xs <- (dup_poly_coords$poly_xs - 9.8)/0.4 + 1
dup_poly_coords$poly_xs <- dup_poly_coords$poly_xs + 1
AH3Ma_AH3Mb_dup_poly_coords <- dup_poly_coords

dup <- AH3Mb.chrYrevcomp_BCMb.chrY_syri[AH3Mb.chrYrevcomp_BCMb.chrY_syri$Annotation_Type == "DUP", ]
dup_poly_coords <- gribbon(dup, min_len = 1e1, coalesce = 0, invert = FALSE)
dup_poly_coords$poly_xs <- (dup_poly_coords$poly_xs - 9.8)/0.4 + 1
dup_poly_coords$poly_xs <- dup_poly_coords$poly_xs + 2
AH3Mb_BCMb_dup_poly_coords <- dup_poly_coords
library(ggplot2)
chrom_wid <- 0.05
p <- ggplot2::ggplot()
p <- p + theme_bw()
p <- p + ggplot2::geom_rect( 
  data = chrom_df, 
  ggplot2::aes( xmin = Pos - chrom_wid,
                xmax = Pos + chrom_wid,
                ymin = 1, ymax = Length),
  fill = "#DCDCDC",
  color = "#000000"
)

# Theme
p <- p + ggplot2::scale_x_continuous( 
  labels = sub("\\.", "\n", sub("revcomp$", "", chrom_df$Id))
)
p <- p + scale_y_continuous(
  breaks = seq( 0, 120e6, by = 20e6), 
  labels = seq( 0, 120, by = 20)
)
  
#  p <- p + ggplot2::theme_bw() + 
p <- p + ggplot2::theme( 
      #      panel.grid.minor.x = ggplot2::element_blank(), 
    panel.grid.minor.x = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 ),
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x=element_blank(),
    panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
#    panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
    panel.grid.minor.y = ggplot2::element_blank()
  )
  
p <- p + ggplot2::ylab("Position (Mbp)")
#p <- p + ggtitle( "Chromosomes X and Y" )

p

p <- p + geom_polygon( 
  data = EH23a.chrX_AH3Ma_syn_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
#  color = "#696969",
#  linewidth = 0.4,
  fill = "#808080"
)

p <- p + geom_polygon( 
  data = EH23a.chrX_AH3Ma_inv_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
#  color = "#696969",
#  linewidth = 0.4,
  color = "#FFA500",
  fill = "#FFA500"
#  fill = "#808080"
)

p <- p + geom_polygon( 
  data = EH23a.chrX_AH3Ma_dup_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
  color = "#1E90FF44",
#  color = "#696969",
  linewidth = 0.1,
  fill = "#1E90FF"
#  fill = "#808080"
)

p <- p + geom_polygon( 
  data = AH3Ma_AH3Mb_syn_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
#  color = "#696969",
#  linewidth = 0.4,
  fill = "#808080"
)

p <- p + geom_polygon( 
  data = AH3Ma_AH3Mb_inv_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
#  color = "#696969",
#  linewidth = 0.4,
  color = "#FFA500",
  fill = "#FFA500"
#  fill = "#808080"
)

p <- p + geom_polygon( 
  data = AH3Ma_AH3Mb_dup_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
  color = "#1E90FF44",
#  color = "#696969",
  linewidth = 0.1,
  fill = "#1E90FF"
#  fill = "#808080"
)


p <- p + geom_polygon( 
  data = AH3Mb_BCMb_syn_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
#  color = "#696969",
#  linewidth = 0.4,
  fill = "#808080"
)

p <- p + geom_polygon( 
  data = AH3Mb_BCMb_inv_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
#  color = "#696969",
#  linewidth = 0.4,
  color = "#FFA500",
  fill = "#FFA500"
#  fill = "#808080"
)

p <- p + geom_polygon( 
  data = AH3Mb_BCMb_dup_poly_coords, 
  aes(x = poly_xs, y = poly_ys, fill = ID, group = ID),
  alpha = 2/5,
  show.legend = FALSE,
  color = "#1E90FF44",
#  color = "#696969",
  linewidth = 0.1,
  fill = "#1E90FF"
#  fill = "#808080"
)

#p
sex_chr <- p
sex_chr

Chrom by genes

my_data <- readr::read_csv("../Figure1b/gene_counts.csv", )
## Rows: 69 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Sample
## dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX, chrY
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# names(my_data)[1] <- "Sample"
# my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
##    Sample  chr1  chr2  chr3  chr4  chr5  chr6  chr7  chr8  chr9  chrX  chrY
##    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 AH3Ma   4052  2763  2586  3189  2424  2535  2058  2892  2610  3689    NA
##  2 AH3Mb   4062  2807  2620  3224  2563  2604  2080  2967  2509    NA  2850
##  3 BCMa    4137  2878  2682  3256  2597  2599  2031  2953  2654  3792    NA
##  4 BCMb    4059  2703  2611  3098  2601  2615  1986  2905  2569    NA  2751
##  5 BOAXa   4097  2877  2680  3147  2444  2718  2015  2950  2612  3778    NA
##  6 BOAXb   4039  2816  2507  3159  2499  2586  1995  2914  2551  3670    NA
##  7 COFBa   4365  2681  2552  3035  2404  2457  1793  2840  2483  3625    NA
##  8 COFBb   4078  2960  2755  3300  2768  2750  2212  3058  2769  3944    NA
##  9 COSVa   4063  2834  2677  3130  2595  2580  2048  2940  2594  3722    NA
## 10 COSVb   4049  2846  2690  3175  2601  2645  2029  2977  2566  3801    NA
## # ℹ 59 more rows
library(tidyr)
data_long <- my_data %>%
  pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Count")
#data_long$Length <- data_long$Length / 1e6
gcount <- data_long
sum(is.na(data_long$Count))
## [1] 69
data_long <- data_long[!is.na(data_long$Count), ]
data_long
## # A tibble: 690 × 3
##    Sample Chrom Count
##    <chr>  <chr> <dbl>
##  1 AH3Ma  chr1   4052
##  2 AH3Ma  chr2   2763
##  3 AH3Ma  chr3   2586
##  4 AH3Ma  chr4   3189
##  5 AH3Ma  chr5   2424
##  6 AH3Ma  chr6   2535
##  7 AH3Ma  chr7   2058
##  8 AH3Ma  chr8   2892
##  9 AH3Ma  chr9   2610
## 10 AH3Ma  chrX   3689
## # ℹ 680 more rows
my_data <- readr::read_csv("../Figure1b/chrom_lengths.csv")
## New names:
## Rows: 69 Columns: 12
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX,
## chrY
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
names(my_data)[1] <- "Sample"
my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
##    Sample   chr1   chr2   chr3   chr4   chr5   chr6   chr7   chr8   chr9    chrX
##    <fct>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>   <dbl>
##  1 AH3Ma  6.57e7 7.65e7 8.20e7 8.16e7 7.65e7 7.63e7 6.64e7 5.41e7 6.60e7  8.42e7
##  2 AH3Mb  6.63e7 7.62e7 8.11e7 8.09e7 7.80e7 7.64e7 6.65e7 5.52e7 6.50e7 NA     
##  3 BCMa   6.51e7 7.58e7 8.62e7 8.12e7 7.67e7 7.53e7 6.55e7 5.48e7 6.49e7  8.33e7
##  4 BCMb   6.32e7 7.83e7 8.16e7 7.84e7 7.69e7 7.68e7 6.44e7 5.42e7 6.55e7 NA     
##  5 BOAXa  6.53e7 7.96e7 8.19e7 7.80e7 7.64e7 7.91e7 6.40e7 5.48e7 6.53e7  8.32e7
##  6 BOAXb  6.50e7 7.76e7 7.95e7 7.97e7 7.57e7 7.58e7 6.49e7 5.44e7 6.60e7  8.26e7
##  7 COFBa  7.21e7 7.85e7 8.44e7 8.21e7 7.76e7 7.97e7 6.37e7 5.57e7 6.55e7  8.57e7
##  8 COFBb  6.47e7 7.71e7 8.21e7 8.16e7 7.77e7 7.71e7 6.84e7 5.52e7 6.63e7  8.46e7
##  9 COSVa  6.76e7 8.34e7 8.63e7 8.52e7 8.09e7 7.93e7 6.99e7 5.53e7 6.81e7  8.65e7
## 10 COSVb  6.61e7 8.15e7 8.74e7 8.40e7 8.46e7 8.30e7 6.50e7 5.75e7 6.78e7  8.58e7
## # ℹ 59 more rows
## # ℹ 1 more variable: chrY <dbl>
library(tidyr)
data_long <- my_data %>%
  pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Length")
data_long$Length <- data_long$Length / 1e6
data_long
## # A tibble: 759 × 3
##    Sample Chrom Length
##    <fct>  <chr>  <dbl>
##  1 AH3Ma  chr1    65.7
##  2 AH3Ma  chr2    76.5
##  3 AH3Ma  chr3    82.0
##  4 AH3Ma  chr4    81.6
##  5 AH3Ma  chr5    76.5
##  6 AH3Ma  chr6    76.3
##  7 AH3Ma  chr7    66.4
##  8 AH3Ma  chr8    54.1
##  9 AH3Ma  chr9    66.0
## 10 AH3Ma  chrX    84.2
## # ℹ 749 more rows
glength <- data_long
glength
## # A tibble: 759 × 3
##    Sample Chrom Length
##    <fct>  <chr>  <dbl>
##  1 AH3Ma  chr1    65.7
##  2 AH3Ma  chr2    76.5
##  3 AH3Ma  chr3    82.0
##  4 AH3Ma  chr4    81.6
##  5 AH3Ma  chr5    76.5
##  6 AH3Ma  chr6    76.3
##  7 AH3Ma  chr7    66.4
##  8 AH3Ma  chr8    54.1
##  9 AH3Ma  chr9    66.0
## 10 AH3Ma  chrX    84.2
## # ℹ 749 more rows
gcount
## # A tibble: 759 × 3
##    Sample Chrom Count
##    <chr>  <chr> <dbl>
##  1 AH3Ma  chr1   4052
##  2 AH3Ma  chr2   2763
##  3 AH3Ma  chr3   2586
##  4 AH3Ma  chr4   3189
##  5 AH3Ma  chr5   2424
##  6 AH3Ma  chr6   2535
##  7 AH3Ma  chr7   2058
##  8 AH3Ma  chr8   2892
##  9 AH3Ma  chr9   2610
## 10 AH3Ma  chrX   3689
## # ℹ 749 more rows
data_long <- cbind(glength, gcount$Count)
names(data_long)[4] <- "Count"
data_long[1:3, ]
##   Sample Chrom   Length Count
## 1  AH3Ma  chr1 65.67137  4052
## 2  AH3Ma  chr2 76.48150  2763
## 3  AH3Ma  chr3 81.99513  2586
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "88", sep = "")

library(ggplot2)
# Basic scatter plot
p <- ggplot(data_long, aes(x=Length, y=Count, color=Chrom)) 
p <- p + geom_point(size=2)
#p <- p + geom_smooth(method=lm)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 1)

p <- p + theme_bw()
p <- p + theme(legend.title = element_blank()) 
#p <- p + theme(legend.position = "left")
p <- p + theme(legend.position = "right")
#p <- p + theme( legend.spacing.y = unit(1.0, 'mm') )
## important additional element
#p <- p + guides(fill = guide_legend(byrow = TRUE))
p + theme(legend.spacing.y = unit(1.0, 'mm')) +
  guides(fill = guide_legend(byrow = TRUE))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 69 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 69 rows containing missing values (`geom_point()`).

#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
p <- p + xlab("Chromosome length (Mbp)")
p <- p + ylab("Genes per chromosome")

gchrom <- p
gchrom
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 69 rows containing non-finite values (`stat_smooth()`).
## Removed 69 rows containing missing values (`geom_point()`).

Figure 1 (Composite plot)

library("ggpubr")

ggarrange(
  plotlist = list(pEH23,
                  ggarrange(sex_chr, gchrom, 
                            ncol = 2, nrow = 1, 
                            widths = c(2, 3),
                            labels = c("B", "C"))
                  ),
  labels = c("A", ""),
  ncol = 1, nrow = 2, 
  widths = 1, heights = c(2, 1))
## Warning: Removed 69 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 69 rows containing missing values (`geom_point()`).
**Figure 1. Genomic architecture of the** ***Cannabis sativa*** **genome.**

Figure 1. Genomic architecture of the Cannabis sativa genome.

Allele frequencies

#afreq <- read.csv("/media/knausb/Vining_lab/knausb/GENOMES_NRQ_2267/ERBxHO40_hapERB/het_wins/allele_freqs.csv")
#
afreq <- read.csv("/media/knausb/Vining_lab/private_data/salk/VCFs/allele_freqs.csv")

afreq$CHROM[ afreq$CHROM == "chr_1e" ] <- "chr_1"
afreq$CHROM[ afreq$CHROM == "chr_2e" ] <- "chr_5"
afreq$CHROM[ afreq$CHROM == "chr_3e" ] <- "chr_2"
afreq$CHROM[ afreq$CHROM == "chr_4e" ] <- "chr_3"
afreq$CHROM[ afreq$CHROM == "chr_5e" ] <- "chr_4"
afreq$CHROM[ afreq$CHROM == "chr_6e" ] <- "chr_7"
afreq$CHROM[ afreq$CHROM == "chr_7e" ] <- "chr_8"
afreq$CHROM[ afreq$CHROM == "chr_8e" ] <- "chr_9"
afreq$CHROM[ afreq$CHROM == "chr_9e" ] <- "chr_6"
afreq$CHROM[ afreq$CHROM == "chr_Xe" ] <- "chr_X"
afreq$CHROM <- paste(afreq$CHROM, "e", sep = "")

unique(afreq$CHROM)
##  [1] "chr_1e" "chr_5e" "chr_2e" "chr_3e" "chr_4e" "chr_7e" "chr_8e" "chr_9e"
##  [9] "chr_6e" "chr_Xe"
afreq[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226
tmp <- afreq
afreq <- tmp[tmp$CHROM == "chr_1e", ]
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_2e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_3e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_4e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_5e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_6e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_7e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_8e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_9e", ])
afreq <- rbind( afreq, tmp[tmp$CHROM == "chr_Xe", ])
rm(tmp)
#
table(afreq$CHROM)
## 
## chr_1e chr_2e chr_3e chr_4e chr_5e chr_6e chr_7e chr_8e chr_9e chr_Xe 
##   2925   4855   5700   3175   3218   4685   2879   2309   1969   3359
#hist(afreq$countNA)
#afreq <- afreq[afreq$countNA <= 2, ]
#
afreq <- afreq[afreq$countNA <= 0, ]
afreq[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226

Fis

#plot(2 * afreq$ERBfreq * afreq$HO40freq, y =  afreq$He)
#plot(2 * afreq$ERBfreq * afreq$HO40freq, y =  afreq$He2)

#hist(afreq$He)

Hs <- 2 * afreq$ERBfreq * afreq$HO40freq
Ho <- afreq$count0.1 / afreq$n

#afreq$Fis <- (Hs - afreq$He)/Hs
afreq$Fis <- (Hs - Ho)/Hs
# afreq$Fis <- (afreq$Fis + 1)/2 # Scale from 0 - 1.

afreq[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2          Fis
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350  0.007541669
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473 -0.249241505
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226  0.014152174
#hist(afreq$Fis)

# my_data$Fis <- (my_data$Hs - my_data$Ho)/my_data$Hs
#nuccomp <- read.csv("../FigureSideo/EH23a.softmasked_nuccomp.csv")
nuccomp <- read.csv("../FigureSideo/EH23b.softmasked_nuccomp.csv")
#
nuccomp <- nuccomp[sort.int(nuccomp$Id, index.return = TRUE)$ix, ]
nuccomp$chrom_num <- sub(".+chr", "", nuccomp$Id)
nuccomp$chrom_num[ nuccomp$chrom_num == "X" ] <- 10
nuccomp$chrom_num <- as.numeric(nuccomp$chrom_num)
nuccomp[1:3, ]
##           Id   Length        a       A        c       C        g       G
## 1 EH23b.chr1 63722626 14201549 7060523  7211614 3358522  7189944 3368219
## 3 EH23b.chr2 80157000 21489664 4919026 11451031 2314835 11452769 2317984
## 4 EH23b.chr3 83805000 22850264 4639830 12292599 2161091 12260323 2163874
##          t       T w W s S m M k K r R y Y n    N chrom_num
## 1 14242915 7080840 0 0 0 0 0 0 0 0 0 0 0 0 0 8500         1
## 3 21311082 4896609 0 0 0 0 0 0 0 0 0 0 0 0 0 4000         2
## 4 22785943 4646076 0 0 0 0 0 0 0 0 0 0 0 0 0 5000         3
nuccomp[, 1:2]
##            Id   Length
## 1  EH23b.chr1 63722626
## 3  EH23b.chr2 80157000
## 4  EH23b.chr3 83805000
## 5  EH23b.chr4 81133510
## 2  EH23b.chr5 78289000
## 9  EH23b.chr6 80012778
## 6  EH23b.chr7 64212500
## 7  EH23b.chr8 55650000
## 8  EH23b.chr9 65951000
## 10 EH23b.chrX 84401000
nuccomp$mids <- floor(nuccomp$Length/2)
nuccomp$Length2 <- 0
nuccomp$mids2 <- 0

nuccomp$Length2[1] <- nuccomp$Length[1]
nuccomp$mids2[1] <- nuccomp$mids[1]

for(i in 2:nrow(nuccomp)){
  nuccomp$Length2[i] <- nuccomp$Length2[ i-1 ] + nuccomp$Length[ i ]
  nuccomp$mids2[i] <- nuccomp$Length2[ i-1 ] + nuccomp$mids[i]
}
nuccomp$CHROM <- sub("^EH23a.", "", nuccomp$Id)
nuccomp$CHROM <- sub("chr", "chr_", nuccomp$CHROM)
nuccomp$CHROM <- paste(nuccomp$CHROM, "e", sep = "")

nuccomp[1:3, ]
##           Id   Length        a       A        c       C        g       G
## 1 EH23b.chr1 63722626 14201549 7060523  7211614 3358522  7189944 3368219
## 3 EH23b.chr2 80157000 21489664 4919026 11451031 2314835 11452769 2317984
## 4 EH23b.chr3 83805000 22850264 4639830 12292599 2161091 12260323 2163874
##          t       T w W s S m M k K r R y Y n    N chrom_num     mids   Length2
## 1 14242915 7080840 0 0 0 0 0 0 0 0 0 0 0 0 0 8500         1 31861313  63722626
## 3 21311082 4896609 0 0 0 0 0 0 0 0 0 0 0 0 0 4000         2 40078500 143879626
## 4 22785943 4646076 0 0 0 0 0 0 0 0 0 0 0 0 0 5000         3 41902500 227684626
##       mids2        CHROM
## 1  31861313 EH23b.chr_1e
## 3 103801126 EH23b.chr_2e
## 4 185782126 EH23b.chr_3e

rePOS

# afreq$POS2 <- 0
# 
# for( i in 1:nrow(nuccomp) ){
# #  afreq$POS2[ afreq$CHROM == EH23a$Id[i] ] <- afreq$POS[ afreq$CHROM == EH23a$Id[i] ] + (EH23a$Length2[i] - EH23a$Length[i])
#   afreq$POS2[ afreq$CHROM == nuccomp$CHROM[i] ] <- afreq$POS[ afreq$CHROM == nuccomp$CHROM[i] ] + (nuccomp$Length2[i] - nuccomp$Length[i])
# }
# # EH23a$Id
# # EH23a$Length2 - EH23a$Length
# afreq[1:3, ]
EH23a <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23a$Frequency <- EH23a$ERBfreq
EH23a$facet1 <- "EH23a"

EH23b <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23b$Frequency <- EH23a$HO40freq
EH23b$facet1 <- "EH23b"

Het <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
Het$Frequency <- EH23a$He2
Het$facet1 <- "Het"
Het[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2 Frequency
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350 0.1082236
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473 0.3617010
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226 0.4996639
##   facet1
## 1    Het
## 2    Het
## 3    Het
Fis_df <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
#Fis_df$Frequency <- Fis
Fis_df$Frequency <- afreq$Fis
Fis_df$facet1 <- "Fis"
#Fis_df[1:3, ]
#nrow(Fis_df)

# ncol(EH23a)
# ncol(EH23b)
# ncol(Het)
# ncol(Fis_df)
# Fis_df[1:3, ]
#
afreq1 <- rbind(EH23a, EH23b, Het)
#afreq <- rbind(EH23a, EH23b, Het, Fis_df)

#rbind(EH23a, EH23b, Fis)

afreq1$facet2 <- afreq1$facet1
afreq1$facet2 <- sub("EH23[ab]", "EH23", afreq1$facet2)
afreq1$facet2[ afreq1$facet2 == "EH23" ] <- "Allele"
afreq1$facet2[ afreq1$facet2 == "Het" ] <- "Heterozygosity"
table(afreq1$facet2)
## 
##         Allele Heterozygosity 
##          70148          35074
afreq1[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2 Frequency
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350 0.9425926
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473 0.7629630
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226 0.4870370
##   facet1 facet2
## 1  EH23a Allele
## 2  EH23a Allele
## 3  EH23a Allele
library(ggplot2)

#my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.01)
my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.04)
set.seed(99)
my_pal <- my_pal[sample(1:length(my_pal), size = length(my_pal))]
#palette(my_pal)

#as.numeric(as.factor(afreq$CHROM))

#afreq$col2 <- viridisLite::magma( n = 10, alpha = 0.8, begin = 0, end = 1 )[as.numeric(as.factor(afreq$CHROM))]

# range(afreq$POS2)

p <- ggplot( data = afreq1, 
             mapping = aes( x = POS2, 
                            #y = ERBfreq,
                            #y = freq,
                            y = Frequency,
                            color = CHROM )
                            #color = col2)
             )
p <- p + geom_point( shape = 20, show.legend = FALSE )
p <- p + theme_bw()
p <- p + theme( axis.title.x=element_blank(),
                axis.text.x = element_text(angle = 60, hjust = 1)
#                plot.margin = unit(c(0.1,0.1,1,0.1), "cm")
                )
#p <- p + theme(axis.title.x = element_blank(), 
#               axis.text.x = element_text( angle = 60, hjust = 1))
# p + scale_x_discrete(breaks = EH23a$mids2, labels = EH23a$Id)
# p + scale_x_discrete(breaks = EH23a$mids2)
p <- p + ggplot2::scale_x_continuous( 
  breaks = nuccomp$mids2,
  expand = expansion(mult = 0.01, add = 0.0),
#  expand = expansion(mult = 0.02, add = 0.0),
  labels = sub("a.chr", ".chr", nuccomp$Id)
)
p <- p + geom_vline( xintercept = nuccomp$Length2, linetype = 1, color = "#808080", linewidth = 0.4 )
p <- p + geom_vline( xintercept = 0, linetype = 1, color = "#808080", linewidth = 0.4 )
#p <- p + geom_hline( yintercept = 0.5, linetype = 1, color = "#000000", linewidth = 1 )
p <- p + geom_hline( yintercept = c(0, 0.5), linetype = 1, color = "#000000", linewidth = 1 )


#p + scale_color_manual( values = viridisLite::magma( n = 10, alpha = 0.01, begin = 0.2, end = 0.9 ) )
#p <- p + xlim(100, sum(nuccomp$Length))

#p <- p + ylim(0.3, 0.7)
p <- p + scale_color_manual( values = my_pal )
#
#p <- 
p + facet_grid(facet1 ~ .)

#p <- 
ahplot <- p + facet_grid(facet2 ~ . , scales = "free_y")
#p
ahplot

#p + scale_color_manual( values = afreq$col2 )
#+ scale_color_viridis_b()

Figure 1 (Composite plot)

library("ggpubr")

ggarrange(
  plotlist = list(pEH23, ahplot),
  labels = c("A", "B"),
  ncol = 1, nrow = 2, 
  widths = 1, heights = c(2, 1))
**Figure 1. Genomic architecture of the** ***Cannabis sativa*** **genome.**

Figure 1. Genomic architecture of the Cannabis sativa genome.

n <- 270

#my_data <- data.frame( p = seq(from = 0.5, to = 1.0, by = 0.1) )
my_data <- data.frame( p = seq(from = 0.5, to = 0.6, by = 0.1) )
my_data$q <- 1 - my_data$p
#
my_data$f <- 0
#my_data$f <- -0.25
#my_data$f <- 0.04

my_data$AA <- (my_data$p^2 * (1 - my_data$f)) + (my_data$p * my_data$f)
my_data$Aa <- 2 * my_data$p * my_data$q * (1 - my_data$f)
my_data$aa <- (my_data$q^2 * (1 - my_data$f)) + (my_data$q * my_data$f)
my_data$Hexcess <- my_data$Aa - (2 * my_data$p * my_data$q)

my_data$Hs <- 2 * my_data$p * my_data$q
my_data$Ho <- my_data$Aa
my_data$Fis <- (my_data$Hs - my_data$Ho)/my_data$Hs

my_data$nAA <- my_data$AA * n
my_data$nAa <- my_data$Aa * n
my_data$naa <- my_data$aa * n
my_data$nHexcess <- my_data$Hexcess * n

knitr::kable(my_data)
p q f AA Aa aa Hexcess Hs Ho Fis nAA nAa naa nHexcess
0.5 0.5 0 0.25 0.50 0.25 0 0.50 0.50 0 67.5 135.0 67.5 0
0.6 0.4 0 0.36 0.48 0.16 0 0.48 0.48 0 97.2 129.6 43.2 0
n <- 270

#my_data <- data.frame( p = seq(from = 0.5, to = 1.0, by = 0.1) )
my_data <- data.frame( p = seq(from = 0.5, to = 0.6, by = 0.1) )
my_data$q <- 1 - my_data$p
#
my_data$f <- -0.25
#my_data$f <- 0.04

my_data$AA <- (my_data$p^2 * (1 - my_data$f)) + (my_data$p * my_data$f)
my_data$Aa <- 2 * my_data$p * my_data$q * (1 - my_data$f)
my_data$aa <- (my_data$q^2 * (1 - my_data$f)) + (my_data$q * my_data$f)
my_data$Hexcess <- my_data$Aa - (2 * my_data$p * my_data$q)

my_data$Hs <- 2 * my_data$p * my_data$q
my_data$Ho <- my_data$Aa
my_data$Fis <- (my_data$Hs - my_data$Ho)/my_data$Hs

my_data$nAA <- my_data$AA * n
my_data$nAa <- my_data$Aa * n
my_data$naa <- my_data$aa * n
my_data$nHexcess <- my_data$Hexcess * n

knitr::kable(my_data)
p q f AA Aa aa Hexcess Hs Ho Fis nAA nAa naa nHexcess
0.5 0.5 -0.25 0.1875 0.625 0.1875 0.125 0.50 0.625 -0.25 50.625 168.75 50.625 33.75
0.6 0.4 -0.25 0.3000 0.600 0.1000 0.120 0.48 0.600 -0.25 81.000 162.00 27.000 32.40

Fis

Filter Alleles.

afreq[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 1 chr_1e  62350 270        509,31 0.1082236 1.121357      240       29        1
## 2 chr_1e  98473 270       412,128 0.3617010 1.566664      145      122        3
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
##   countNA ERBcount HO40count   ERBfreq   HO40freq       He2   POS2          Fis
## 1       0      509        31 0.9425926 0.05740741 0.1082236  62350  0.007541669
## 2       0      412       128 0.7629630 0.23703704 0.3617010  98473 -0.249241505
## 3       0      263       277 0.4870370 0.51296296 0.4996639 204226  0.014152174
afreq <- afreq[afreq$ERBfreq >= 0.3, ]
afreq <- afreq[afreq$ERBfreq <= 0.7, ]
afreq <- afreq[afreq$HO40freq >= 0.3, ]
afreq <- afreq[afreq$HO40freq <= 0.7, ]
afreq <- afreq[afreq$He2 >= 0.25, ]
afreq <- afreq[afreq$He2 <= 0.75, ]
EH23a <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23a$Frequency <- EH23a$ERBfreq
EH23a$facet1 <- "EH23a"

EH23b <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
EH23b$Frequency <- EH23a$HO40freq
EH23b$facet1 <- "EH23b"

Het <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
Het$Frequency <- EH23a$He2
Het$facet1 <- "Het"
Het[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
## 4 chr_1e 222275 270       270,270 0.5000000 2.000000       63      144       63
## 5 chr_1e 249752 270       263,277 0.4996639 1.998657       61      141       68
##   countNA ERBcount HO40count  ERBfreq HO40freq       He2   POS2 Frequency
## 3       0      263       277 0.487037 0.512963 0.4996639 204226 0.4996639
## 4       0      270       270 0.500000 0.500000 0.5000000 222275 0.5000000
## 5       0      263       277 0.487037 0.512963 0.4996639 249752 0.4996639
##   facet1
## 3    Het
## 4    Het
## 5    Het
Fis_df <- afreq[, c("CHROM", "POS", "n", "Allele_counts", "He", "Ne", "count0.0", 
"count0.1", "count1.1", "countNA", "ERBcount", "HO40count", "ERBfreq", 
"HO40freq", "He2", "POS2")]
#Fis_df$Frequency <- Fis
Fis_df$Frequency <- afreq$Fis
Fis_df$facet1 <- "Fis"
#Fis_df[1:3, ]
#nrow(Fis_df)

# ncol(EH23a)
# ncol(EH23b)
# ncol(Het)
# ncol(Fis_df)
# Fis_df[1:3, ]
#afreq1 <- rbind(EH23a, EH23b, Het)
#
#
afreq <- rbind(EH23a, EH23b, Het, Fis_df)
afreq$facet2 <- afreq$facet1
afreq$facet2 <- sub("EH23[ab]", "EH23", afreq$facet2)
afreq$facet2[ afreq$facet2 == "EH23" ] <- "Allele"
afreq$facet2[ afreq$facet2 == "Het" ] <- "Heterozygosity"
table(afreq$facet2)
## 
##         Allele            Fis Heterozygosity 
##          46118          23059          23059
afreq[1:3, ]
##    CHROM    POS   n Allele_counts        He       Ne count0.0 count0.1 count1.1
## 3 chr_1e 204226 270       263,277 0.4996639 1.998657       65      133       72
## 4 chr_1e 222275 270       270,270 0.5000000 2.000000       63      144       63
## 5 chr_1e 249752 270       263,277 0.4996639 1.998657       61      141       68
##   countNA ERBcount HO40count  ERBfreq HO40freq       He2   POS2 Frequency
## 3       0      263       277 0.487037 0.512963 0.4996639 204226  0.487037
## 4       0      270       270 0.500000 0.500000 0.5000000 222275  0.500000
## 5       0      263       277 0.487037 0.512963 0.4996639 249752  0.487037
##   facet1 facet2
## 3  EH23a Allele
## 4  EH23a Allele
## 5  EH23a Allele
library(ggplot2)

table(afreq$facet2)
## 
##         Allele            Fis Heterozygosity 
##          46118          23059          23059
afreq <- afreq[afreq$facet2 != "Heterozygosity", ]


#my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.01)
my_pal <- viridisLite::inferno(n = 10, begin = 0.2, end = 0.9, alpha = 0.04)
set.seed(99)
my_pal <- my_pal[sample(1:length(my_pal), size = length(my_pal))]
#palette(my_pal)

#as.numeric(as.factor(afreq$CHROM))

#afreq$col2 <- viridisLite::magma( n = 10, alpha = 0.8, begin = 0, end = 1 )[as.numeric(as.factor(afreq$CHROM))]

# range(afreq$POS2)

p <- ggplot( data = afreq, 
             mapping = aes( x = POS2, 
                            #y = ERBfreq,
                            #y = freq,
                            y = Frequency,
                            color = CHROM )
                            #color = col2)
             )
p <- p + geom_point( shape = 20, show.legend = FALSE )
p <- p + theme_bw()
p <- p + theme( axis.title.x=element_blank(),
                axis.text.x = element_text(angle = 60, hjust = 1)
#                plot.margin = unit(c(0.1,0.1,1,0.1), "cm")
                )
#p <- p + theme(axis.title.x = element_blank(), 
#               axis.text.x = element_text( angle = 60, hjust = 1))
# p + scale_x_discrete(breaks = EH23a$mids2, labels = EH23a$Id)
# p + scale_x_discrete(breaks = EH23a$mids2)
p <- p + ggplot2::scale_x_continuous( 
  breaks = nuccomp$mids2,
  expand = expansion(mult = 0.01, add = 0.0),
#  expand = expansion(mult = 0.02, add = 0.0),
  labels = sub("a.chr", ".chr", nuccomp$Id)
)
p <- p + theme( panel.grid.major.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )
p <- p + theme( panel.grid.minor.x = element_line(color = "#A9A9A9", linewidth = 0.0 ) )

# p <- p + geom_vline( xintercept = nuccomp$Length2, linetype = 1, color = "#808080", linewidth = 0.4 )
# p <- p + geom_vline( xintercept = 0, linetype = 1, color = "#808080", linewidth = 0.4 )
p <- p + geom_vline( xintercept = nuccomp$Length2, linetype = 5, color = "#808080", linewidth = 0.4 )
p <- p + geom_vline( xintercept = 0, linetype = 5, color = "#808080", linewidth = 0.4 )

#p <- p + geom_hline( yintercept = 0.5, linetype = 1, color = "#000000", linewidth = 1 )
#p <- p + geom_hline( yintercept = c(0, 0.5), linetype = 1, color = "#000000", linewidth = 1 )


#p + scale_color_manual( values = viridisLite::magma( n = 10, alpha = 0.01, begin = 0.2, end = 0.9 ) )
#p <- p + xlim(100, sum(nuccomp$Length))

#p <- p + ylim(0.3, 0.7)
p <- p + scale_color_manual( values = my_pal )
#
#p <- p + facet_grid(facet1 ~ .)
#p <- 
#ahplot + ylab("")

ahplot <- ahplot + scale_y_continuous( breaks = seq(-1.0, 1.0, by = 0.2),
                  minor_breaks = seq(-1, 1, by = 0.1) )

ahplot <- p + facet_grid(facet2 ~ . , scales = "free_y")
#ahplot <-   ahplot + theme( panel.grid.major.y = element_line(color = "#696969", linewidth = 0.6 ) )
ahplot <- ahplot + theme( panel.grid.major.y = element_line(color = "#A9A9A9", linewidth = 0.4 ) )
#ahplot + theme( panel.grid.major.y = element_line(color = "#C0C0C0", linewidth = 0.2, linetype = 1 ) )

#  myPlots[[i]] <- myPlots[[i]] + theme( panel.grid.minor.y=element_line(color = "#C0C0C0", size=0.2) )
ahplot <- ahplot + ylab(NULL)


#p
ahplot

#hline_dat <- data.frame( threshold = c(0.5, 0.0) )
#ahplot + geom_hline(data=hline_dat, aes(yintercept=threshold), colour="salmon")
 
#p + scale_color_manual( values = afreq$col2 )
#+ scale_color_viridis_b()
library("ggpubr")

ggarrange(
  plotlist = list(pEH23, ahplot),
  labels = c("A", "B"),
  ncol = 1, nrow = 2, 
  widths = 1, heights = c(2, 1))
**Figure 1. Genomic architecture of the** ***Cannabis sativa*** **genome.**

Figure 1. Genomic architecture of the Cannabis sativa genome.

# ggsave( filename = "EH23_chroms_freqs.tiff", 
#         device = "tiff", 
#         width = 6.5, height = 9, units = "in", 
#         dpi = 300,
#         compression = "lzw")
t99 <- Sys.time()
t99 - t1
## Time difference of 4.071925 mins